In [2]:
%pylab inline
from seqtools import *
from seqplot import *
from hla_prediction import *
from IPython.display import Image,HTML
import itertools
from myboxplot import *
from objhist import *


Populating the interactive namespace from numpy and matplotlib

References for the sequences I found:

"Antigenicity and immunogenicity of a novel, acute HIV-1 Tanzanian subtype C gp145 envelope protein for clinical development"

"A Next-Generation Cleaved, Soluble HIV-1 Env Trimer, BG505 SOSIP.664 gp140, Expresses Multiple Epitopes for Broadly Neutralizing but Not Non-Neutralizing Antibodies"

HBsAg


In [17]:
# Render the PNG file 'Figure_S1-01.png' as this cell's output (Image is imported from IPython.display).
Image('Figure_S1-01.png')


Out[17]:

In [5]:
# Load the immunogen sequences; the result is indexed by sequence name
# (it is iterated through `.index` and looked up with `seqs[name]` below).
seqs = fasta2align('adjuvant_trial.fasta')

# Echo every record in FASTA layout (header line, sequence, blank line).
# A single pre-formatted string is passed to print() so the cell produces the
# same output whether `print` is the Python 2 statement or the Python 3 function.
for name in seqs.index:
    print('>%s\n%s\n' % (name, seqs[name]))


>gp120_6980.v0.c31
MRVRGILRNWQQWWIWGILGFWMVLICSGNLWVTVYYGVPVWREAKTTLFCASDAKAYEREVHNVWATHACVPTDPDPQEIFLGKNVTEKFNMWKNYMVDQMHEDIISLWDQSLQPCVKLTPLCITLNCTDVTAHNGNTVYDNNATVVNSTNEIKNCSFNITTELRDKRKKEHALFNNLDIVQLDGNSSLYRLINCNTSIIKQACPKISFDPIPIHYCAPAGFVILKCNNETFNGTGPCNNVSAVQCTHGIKPVVSTQLLLNGSLAKGEIMIRSENITDNVKTIIVHLNNSVEIVCTRPNNNTRKSIRIGPGQTFYATGDIIGDIRQAYCSINESNWNATLQRVSKKLAEHFPNKTIQFKSPSGGDLEITMHSFNCRGEFFYCNTSKLFNGTYYPNGTYYPNGTNSTLIIPCRIKQIINMWQGVGKAIYASPIAGNITCRSNITGLLLTRDGGDTNDTEIFRPAGGDMRDNWRSELYKYKIVEIKPLGVAPTEAKRRVVKREKRAVTIGAVFLGFLGAAGSTMGAASITLTVQARQLLSGIVQQQSNLLRAIEAQQHMLQLTVWGIKQLQARVLAIERCLKDQQLLGIWGCSGKLICTTAVPWNNSWSNRTQDEIWKNLTWMEWDREISNYTNTIYELLEVSQSQQERNEKDLLALDSWNNLWNWFDISNWLWYIKIFIMIVGGLIGLRIIFAVLSIVNRVRQGYSPLSFQTLIPNQREPDRPGRIEEEGGEQDKDRSIRLVSGFLALAWDDLRSLCIFLYHHLRDFILIAARATELLGRSSLRGLQRGWEALKYLGSLVQYWGLEIKKSAINLLDTIAIAVAEGTDRIIEIVQRACRAVLNIPRRIRQGLEAALQ

>BG505_SOSIP.664
MDAMKRGLCCVLLLCGAVFVSPSQEIHARFRRGARAENLWVTVYYGVPVWKDAETTLFCASDAKAYETEKHNVWATHACVPTDPNPQEIHLENVTEEFNMWKNNMVEQMHTDIISLWDQSLKPCVKLTPLCVTLQCTNVTNNITDDMRGELKNCSFNMTTELRDKKQKVYSLFYRLDVVQINENQGNRSNNSNKEYRLINCNTSAITQACPKVSFEPIPIHYCAPAGFAILKCKDKKFNGTGPCPSVSTVQCTHGIKPVVSTQLLLNGSLAEEEVMIRSENITNNAKNILVQFNTPVQINCTRPNNNTRKSIRIGPGQAFYATGDIIGDIRQAHCNVSKATWNETLGKVVKQLRKHFGNNTIIRFANSSGGDLEVTTHSFNCGGEFFYCNTSGLFNSTWISNTSVQGSNSTGSNDSITLPCRIKQIINMWQRIGQAMYAPPIQGVIRCVSNITGLILTRDGGSTNSTTETFRPGGGDMRDNWRSELYKYKVVKIEPLGVAPTRCKRRVVGRRRRRRAVGIGAVFLGFLGAAGSTMGAASMTLTVQARNLLSGIVQQQSNLLRAPEAQQHLLKLTVWGIKQLQARVLAVERYLRDQQLLGIWGCSGKLICCTNVPWNSSWSNRNLSEIWDNMTWLQWDKEISNYTQIIYGLLEESQNQQEKNEQDLLALD

>HBsAg_AY646444
MENTTSGFLGPLLVLQAGFFLLTRILTIPQSLDSWWTSLNFLGGAPTCPGQNSQSPTSNHSPTSCPPTCPGYRWMCLRRFIIFLFILLLCLIFLLVLLDYQGMLPVCPLLPGTSTTSTGPCKTCTIPAQGTSMFPSCCCTKPSDGNCTCIPIPSSWAFARFLWEWASVRFSWLSLLVPFVQWFVGLSPTVWLSVIWMMWYWGPSLYNILSPFLPLLPIFFCLWVYI

>HBsAg_JF970209
MENTTSGFLGPLLVLQAGFFLLTRILTIPQSLDSWWTSLNFLGGAPTCLGQNSQSPTSNHSPTSCPPICPGYRWMCLRRFIIFLFILLLCLIFLLVLLDYQGMLPVCPLLPRTSTTSTGPCKTCTIPAQGTSMFPSCCCTKPSDGNCTCIPIPSSWAFARFLWEWASVRFSWLSLLVPFVQWFVGLSPTVWLSVIWMMWYWGPSLYNILSPFLPLLPIFFCLWVYI

Note the similarity between the two HBsAg sequences (they differ at only 3 of 226 positions)


In [25]:
# Compare the two HBsAg variants position by position; in the printed output a '|'
# marks each mismatching position and the last line reports the overall similarity.
compSeq(seqs['HBsAg_AY646444'],seqs['HBsAg_JF970209'])


Pos 1 - 50
M E N T T S G F L G P L L V L Q A G F F L L T R I L T I P Q S L D S W W T S L N F L G G A P T C P G
                                                                                                |  
M E N T T S G F L G P L L V L Q A G F F L L T R I L T I P Q S L D S W W T S L N F L G G A P T C L G

Pos 51 - 100
Q N S Q S P T S N H S P T S C P P T C P G Y R W M C L R R F I I F L F I L L L C L I F L L V L L D Y
                                  |                                                                
Q N S Q S P T S N H S P T S C P P I C P G Y R W M C L R R F I I F L F I L L L C L I F L L V L L D Y

Pos 101 - 150
Q G M L P V C P L L P G T S T T S T G P C K T C T I P A Q G T S M F P S C C C T K P S D G N C T C I
                      |                                                                            
Q G M L P V C P L L P R T S T T S T G P C K T C T I P A Q G T S M F P S C C C T K P S D G N C T C I

Pos 151 - 200
P I P S S W A F A R F L W E W A S V R F S W L S L L V P F V Q W F V G L S P T V W L S V I W M M W Y
                                                                                                   
P I P S S W A F A R F L W E W A S V R F S W L S L L V P F V Q W F V G L S P T V W L S V I W M M W Y

Pos 201 - 226
W G P S L Y N I L S P F L P L L P I F F C L W V Y I
                                                   
W G P S L Y N I L S P F L P L L P I F F C L W V Y I

Seq1 (226) and Seq2 (226) are 98.7% similar

Analysis steps:

  1. Make two lists of all 9mers from the HIV Env and the HBsAg immunogens

  2. Compute all pairwise hamming distances between the two sets of 9mers

  3. Plot distribution of 9mer pairs with distances less than 6 mismatches


In [14]:
# Peptide length used for every k-mer extracted in this analysis.
k = 9

# Names of the two HIV Env immunogens and the two HBsAg variants in `seqs`.
env_a, env_b = 'gp120_6980.v0.c31', 'BG505_SOSIP.664'
hbv_a, hbv_b = 'HBsAg_AY646444', 'HBsAg_JF970209'

# Pool the k-mers of both sequences of each antigen and drop duplicates.
Env_mers = unique(
    getMers(seqs[env_a], nmers=[k]) + getMers(seqs[env_b], nmers=[k])
)
HBsAg_mers = unique(
    getMers(seqs[hbv_a], nmers=[k]) + getMers(seqs[hbv_b], nmers=[k])
)

In [15]:
# Enumerate every (Env 9mer, HBsAg 9mer) combination once, in product order,
# so that row i of `merPairs` and element i of `pwdist` describe the same pair.
all_pairs = list(itertools.product(Env_mers, HBsAg_mers))

# Hamming distance (number of mismatching positions) for each pair, stored as floats.
pwdist = array([hamming_distance(env_pep, hbv_pep) for env_pep, hbv_pep in all_pairs], dtype=float)

# The peptide pairs themselves, one [env_pep, hbv_pep] row per distance.
merPairs = array([[env_pep, hbv_pep] for env_pep, hbv_pep in all_pairs])

In [18]:
# Bar chart of how many Env/HBsAg 9mer pairs have 0..5 mismatches.
mismatch_bins = arange(6)
oh = objhist(pwdist,keys=range(6))
pair_counts = [oh[n] for n in mismatch_bins]

# Bar positions are passed positionally: the keyword for them was `left` in old
# matplotlib releases and is `x` in current ones, so the positional form is the
# only spelling that works on both.
bar(mismatch_bins, pair_counts, color='gray', align='center')
xlabel('N mismatches')
ylabel('Number of peptide pairs\nw/ N mismatches')
_ = xlim((-1,6))
_ = ylim((0,50))
_ = xticks(mismatch_bins)


Results

  • No 9mer pairs share more than 5 amino acids
  • There are 30 pairs of 9mer that share exactly 5 amino acids
  • Moving to 10mers decreases the number of pairs sharing 5 amino acids to 2; the overlap gets even lower for 12-15 AA peptides, which would be more typical of a CD4+ T-cell epitope
  • Frahm et al. have data that show that T-cell specificity is typically lost after 2-3 mutations of an epitope
  • It is highly unlikely that HBsAg and HIV Env will have cross-reactive T-cell responses in this trial

In [17]:
# List the closest peptide pairs: at most 4 mismatches out of 9 positions,
# i.e. at least 5 shared amino acids. Each pair is printed as the Env 9mer,
# the HBsAg 9mer, then a blank line.
# One pre-formatted string per pair keeps the output identical under the
# Python 2 print statement and the Python 3 print function.
for env_pep, hbv_pep in merPairs[pwdist<=4,:]:
    print('%s\n%s\n' % (env_pep, hbv_pep))


CTNVPWNSS
CTCIPIPSS

DGGSTNSTT
PGTSTTSTG

DLRSLCIFL
CLRRFIIFL

EPLGVAPTR
NFLGGAPTC

FLGFLGAAG
SLNFLGGAP

GGSTNSTTE
GTSTTSTGP

GILGFWMVL
GFLGPLLVL

IEPLGVAPT
LNFLGGAPT

IKPLGVAPT
LNFLGGAPT

KPLGVAPTE
NFLGGAPTC

LDSWNNLWN
LDSWWTSLN

LGFLGAAGS
LNFLGGAPT

LGVAPTEAK
LGGAPTCLG

LGVAPTEAK
LGGAPTCPG

LGVAPTRCK
LGGAPTCLG

LGVAPTRCK
LGGAPTCPG

LRSLCIFLY
LRRFIIFLF

PIAGNITCR
PSDGNCTCI

PLGVAPTEA
FLGGAPTCL

PLGVAPTEA
FLGGAPTCP

PLGVAPTRC
FLGGAPTCL

PLGVAPTRC
FLGGAPTCP

PSQEIHARF
PSSWAFARF

QGSNSTGSN
QNSQSPTSN

RDGGSTNST
LPGTSTTST

SNSTGSNDS
SQSPTSNHS

SPIAGNITC
KPSDGNCTC

STGSNDSIT
SPTSNHSPT

TNVPWNSSW
TCIPIPSSW

VFLGFLGAA
TSLNFLGGA

Shuffled HIV Env sequences

  • It seems obvious that there is no homology between the two proteins, but still there is a certain low level of overlap (30 9mers sharing 5 amino acids). Is this more than you'd expect with random amino acids?
  • My thinking is that sometimes even non-homologous proteins probably share similar short amino-acid motifs (e.g. beta barrel or alpha helix backbones etc.), but that these would be destroyed with an amino-acid shuffle (though a shuffle preserves AA frequencies).
  • If you take the HIV Env immunogen and shuffle its amino acids, you find a similar level of 9mer overlap with HBsAg
  • As it turned out, the small level of overlap that we observe is characteristic of HBsAg and any randomly shuffled protein with the same length and AA makeup. I guess it’s a way of putting the result in context.
  • There are even a few cases of shuffled 9mers that differ by only 3 mismatches, which we never observe with the correctly ordered HIV Env immunogen

In [13]:
# Null model: shuffle the residues of the gp120 Env immunogen (same length and
# amino-acid composition, random order) and repeat the 9mer-vs-HBsAg mismatch
# histogram for each of 10 independent shuffles.
#
# All per-replicate results use `shuffled_*` names so that this cell does not
# overwrite `Env_mers`, `merPairs`, `pwdist` or `oh` from the real (unshuffled)
# analysis, which other cells read.
env_seq = seqs['gp120_6980.v0.c31']
mismatch_bins = arange(6)

for rep in range(10):
    # NOTE: no random seed is set in this cell, so each run draws different shuffles.
    shuffled_seq = ''.join([env_seq[i] for i in permutation(len(env_seq))])
    shuffled_mers = unique(getMers(shuffled_seq, nmers=[k]))

    # Hamming distance for every (shuffled Env 9mer, HBsAg 9mer) combination.
    shuffled_pwdist = zeros(len(shuffled_mers)*len(HBsAg_mers))
    for i,(env_pep, hbv_pep) in enumerate(itertools.product(shuffled_mers,HBsAg_mers)):
        shuffled_pwdist[i] = hamming_distance(env_pep,hbv_pep)

    shuffled_oh = objhist(shuffled_pwdist,keys=range(6))

    # One figure per replicate. Bar positions are passed positionally because
    # their keyword differs between matplotlib versions (`left` vs `x`).
    figure()
    bar(mismatch_bins, [shuffled_oh[n] for n in mismatch_bins], color='gray', align='center')
    title('Shuffled Env, replicate %d' % (rep + 1))
    xlabel('N mismatches')
    ylabel('Number of peptide pairs\nw/ N mismatches')
    _ = xlim((-1,6))
    _ = ylim((0,50))
    _ = xticks(mismatch_bins)